*********************************************************************************************************************
* JOB: 			Create the main sample 
* PAPER: 		Cybersecurity Risk
* INPUT: 		Compustat and other files
* OUTPUT: 		Sample_Final
*
* DESCRIPTION:	This program creates the main panel dataset for the analysis  
*
*********************************************************************************************************************;

/* Libraries */
libname raw_data 	'g:\Research\databases\compustat';
libname ciklink		'g:\Research\papers\cyber risk\cik_cusip linktable';

/* Macros */
%include "g:\Research\SAS Help files\fama_french_12.sas";

*********************************************************************************************************************
*
* Import data + Calculation of variables
*
*********************************************************************************************************************;

/* 1. Using 10-Ks in Edgar: General data */
%let edgarpath = g:\Cybersecurity risk\Edgar data; 
proc import out = work.edgar datafile = "&edgarpath\10_K.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;

data edgar;
	set edgar;
	/* Keep if able to read Item 1A in 10-Ks */
	if item1AUse = 1;

	/* Construction of Variables */
	/* 10K */
	lnread = log(paperTXTFileSize); 									* Log of readability;
	lndirect_indirect_s = log(1+sentencescounter);						* Log of no of cybersecurity risk disclosure sentences;
	risk_length = numRiskFactorSentencesTotal - sentencescounter;		* No of sentences in Item 1A - No of cybersecurity risk disclosure sentences;
	lnrisk_length = log(1 + risk_length); 								* Log of (No of sentences in Item 1A - No of cybersecurity risk disclosure sentences); 
	if missing(risk_length) = 0 then cikmatched = 1;					* CIK from Edgar matched with WRDS gvkey-cik linktable ;

	/* Cyber insurance */
	up_sentencesdescription = upcase(sentencesdescription);				* Cybersecurity risk disclosure text;
	insurance = 0;
	if find(up_sentencesdescription,'INSURANCE','I') then insurance = 1;

	/* Language Sentiment - relative to total number of words */ 
	sent_neg1 = negativewords / totalwords_1;							* Based on pre-defined words constructed by Loughran and McDonald;
	sent_uncert1 = uncertaintywords / totalwords_1;						* Based on pre-defined words constructed by Loughran and McDonald;
	sent_lit1 = litigiouswords / totalwords_1;							* Based on pre-defined words constructed by Loughran and McDonald;

	/* Language Sentiment - relative to total number of words (after excluding certain types of words e.g. pronouns, conjunctions, stop words etc. */ 
	sent_neg2 = negativewords / totalwordsadj;							* Based on pre-defined words constructed by Loughran and McDonald;
	sent_uncert2 = uncertaintywords / totalwordsadj;					* Based on pre-defined words constructed by Loughran and McDonald;
	sent_lit2 = litigiouswords / totalwordsadj;							* Based on pre-defined words constructed by Loughran and McDonald;

	filingdate = mdy(filingmonth, filingday, filingyear);
	datadate = mdy(report_month, report_day, report_year);

 	format datadate yymmddn.;
 	format filingdate yymmddn.;
run;
proc sort data = edgar out = edgar nodupkey;
	by cik datadate;
run;

/* 2. Using 10-Ks in Edgar: Similarity data based on the last one year attacks 
		a. Contains cosine similarity: sim_breaches_t_1_all;
		b. Jaccard similarity: simj_breaches_t_1_all */
%let simpath = g:\Cybersecurity risk\Similarity;
proc import out = work.sim_l1 datafile = "&simpath\sim_Lag1.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;

/* 3. Using 10-Ks in Edgar: Similarity data based on the last two years attacks
		a. Contains cosine similarity: sim_breaches_t_2_all;
		b. Jaccard similarity: simj_breaches_t_2_all */
proc import out = work.sim_l2 datafile = "&simpath\sim_Lag2.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;

/*	4. BoardEx: Information to calculate
		a. Presence of risk committee
		b. Independent directors */
%let other = g:\Research\Papers\Cyber risk\Other data;
proc import out = work.risk_committe_boardex datafile = "&other\risk_committe_boardex.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;
proc sort data = risk_committe_boardex nodupkey;
	by gvkey datadate;
run;

/*	5. Thomson-Reuters 13F: Information to calculate
		a. Institutional ownership */
proc import out = work.inst_ownshp datafile = "&other\inst_ownshp.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;
proc sort data = inst_ownshp nodupkey;
	by gvkey datadate;
run;

/*	6. PRC: The sample of cyberattacks matched to Compustat/CRSP - variable of interest
		a. Major_attack */
%let prcpath = g:\Cybersecurity risk\PRC data; 
proc import out = work.mbreaches_wmultiple_attacks datafile = "&prcpath\mbreaches_wmult_attacks.xlsx" dbms = 'xlsx' ;
     sheet = "sheet1"; 
     getnames = yes;
run;
data mbreaches_wmultiple_attacks;
	set mbreaches_wmultiple_attacks;
	/* Keep first attack when there are multiple attacks for the same firm within a fiscal year */
	if delete = 1 then delete;
run;

/* Keep only a major attack per firm/year */
proc sort data = mbreaches_wmultiple_attacks;
	by gvkey fyear descending major_attack;
run;
proc sort data = mbreaches_wmultiple_attacks out = unfy_breaches nodupkey;
	by gvkey fyear;
run;

/* 7. Compustat Variables 		*/

%let CompVars = gvkey
				cusip
				conm 
				cik
				datadate 
				fyear 
				fyr 
				sich
				sic
				sale   
				at     
				xrd    
				ceq    
				prcc_f 
				ajex   
				dvpsx_f
				csho   
				au     
				ppent  
				che    
				prstkc 
				pstkrv 
				ib
				dp
				oibdp
                ;

data funda; 
	set raw_data.funda (keep = &CompVars indfmt datafmt popsrc consol fyr);	
	if indfmt = 'INDL' and datafmt = 'STD' and popsrc = 'D' and consol = 'C'; 
	if fyr > 0; /* Eliminating non-accurate obs based on some WRDS reccomendations */
	end_fyr = datadate; 
	beg_fyr = intnx('month', end_fyr, -11, 'beg');
	/* Create begin and end dates for fiscal year */ 
    format end_fyr date9.
           beg_fyr date9.;
run;

/* Keep the first year a firm appears in the database - Use it later to calculate firm age */
proc sort data = funda out = fage nodupkey;
	by gvkey;
run;

/* 8. Linktable			*/

/*Sign on to WRDS */
options nomautosource noimplmac; run;
%let wrds = wrds.wharton.upenn.edu 4016;
options comamid=TCP remote=WRDS;
signon username="" password="";
rsubmit;

libname ccm '/wrds/crsp/sasdata/a_ccm';
proc download data = ccm.ccmxpf_linktable_new out = ccmxpf_linktable_new;
run;

endrsubmit;
signoff;


*********************************************************************************************************************
*
* Merge the data
*
*********************************************************************************************************************;

/* 1. Merge Compustat with CRSP */

proc sql;
	create table funda_crsp as 
    select * 
    from funda as a, ccmxpf_linktable_new (rename= (gvkey = ccm_gvkey)) as b
	where   a.gvkey = b.ccm_gvkey  
	and b.lpermno ne . 
    and b.linktype in ("LC" "LN" "LU" "LX" "LD" "LS") 
    and b.linkprim IN ("C", "P")  
    and ((a.datadate >= b.LINKDT) or b.LINKDT eq .B) and  
       ((a.datadate <= b.LINKENDDT) or b.LINKENDDT eq .E)   ;
quit;

proc sort data = funda_crsp nodupkey;
	by gvkey fyear;
run; 

/* 2. Merge the data with WRDS gvkey-cik linktable*/

proc sql;	
	create table funda_crsp_link
	as select a.*, b.cik as cik_new
	from funda_crsp as a left join ciklink.gvkey_cik_link_master as b on
	a.gvkey = b.gvkey and
	b.datadate1 <= a.datadate <= b.datadate2 and
	b.flag >= 2;
quit;

/* If historical cik is missing then use the current cik */

data funda_crsp_link;
	set funda_crsp_link;
	cik1 = cik + 0;
	cik_new1 = cik_new + 0;
	cik_new2 = cik_new1;
	if missing(cik_new1) then cik_new2 = cik;
run;

/* 3. Merge the data with other databases */

proc sql;	
	create table funda_crsp_link_m
	as select a.*, b.fyear as birth, 
	c.prcc_f as lag1_prcc_f, c.dvpsx_f as lag1_dvpsx_f, c.ajex as lag1_ajex, 
	d.lnread, d.read, d.sentencescounter as direct_indirect_s, d.lndirect_indirect_s, d.lnrisk_length, d.risk_length, d.cikmatched, d.filingdate,d.tradeSecretW5 as secrets5, d.filenamehtml, d.coname as coname_10_k, d.item1AUse, d.insurance, d.sent_neg1, d.sent_uncert1, d.sent_lit1, d.sent_neg2, d.sent_uncert2, d.sent_lit2, 
	e.risk_committe as risk_com, (e.num_neds / e._freq_) as ind_directors,
	f.inst_perblock,
	g.attack, g.major_attack, g.givenconm, g.matchedconm,  
	h.sim_breaches_t_1_all, h.simj_breaches_t_1_all, 
	i.sim_breaches_t_2_all, i.simj_breaches_t_2_all

	from funda_crsp_link as a left join fage as b on
	a.gvkey = b.gvkey 
						   left join funda_crsp as c on
	a.gvkey = c.gvkey and
	a.fyear = (c.fyear + 1)
						   left join edgar as d on
	a.cik_new2= d.cik and
	year(a.datadate) = year(d.datadate) and
	(month(d.datadate)-1) <= month(a.datadate) <= month(d.datadate) 
						   left join risk_committe_boardex as e on
	a.gvkey = e.gvkey and
	a.datadate = e.datadate
						   left join inst_ownshp as f on
	a.gvkey = f.gvkey and
	a.datadate = f.datadate
						   left join unfy_breaches as g on
	a.gvkey = g.gvkey and
	year(a.datadate) = year(g.datadate)
						   left join sim_l1 as h on
	a.gvkey = h.gvkey and
	a.fyear = h.fyear
						   left join sim_l2 as i on
	a.gvkey = i.gvkey and
	a.fyear = i.fyear;
quit;
proc sort data = funda_crsp_link_m;
	by gvkey fyear descending direct_indirect_s;
run;

proc sort data = funda_crsp_link_m nodupkey;
	by gvkey fyear;
run;

*********************************************************************************************************************
*
* Other adjustments/calculations
*
*********************************************************************************************************************;

/* 1. Adjustment: When there is no attack at t-1, use similarity calculated using t-2 information */
data sample;
	set funda_crsp_link_m;
	if sim_breaches_t_1_all = 0 and sim_breaches_t_2_all > 0 then sim_breaches_t_1_all = sim_breaches_t_2_all;
	if simj_breaches_t_1_all = 0 and simj_breaches_t_2_all > 0 then simj_breaches_t_1_all = simj_breaches_t_2_all;
run;

/* 2. Adjustment: For missing values */
data sample;
	set sample;

	if missing(xrd) then xrd = 0;		/* It may contain firm-year observations with negative values - according to wrds this is not error but likely adjustments of funds allocated in earlier years */
	if xrd < 0 then xrd = 0;		    /* We adjust the negative figures to zero */
	if missing(dvpsx_f) then dvpsx_f = 0; 
	if missing(lag1_dvpsx_f) then lag1_dvpsx_f = 0;
	if missing(prstkc) then prstkc = 0;
	if missing(pstkrv) then pstkrv = 0;

	if missing(attack) then attack = 0;
	if missing(major_attack) then major_attack = 0;

	* Other adjustments - CIK from Edgar matched with WRDS gvkey-cik linktable;
	if missing(cikmatched) then cikmatched = 0;
run;

/* 3. Create necessary lead lag variables */
proc printto print = 'rocks.expand.lst' log = 'rocks.expand.log' new;
run;
proc expand data = sample out = sample method = none;
	by gvkey;
	id fyear;
	convert prstkc    				= prstkc_lag1 					/ transform = (lag 1);
	convert pstkrv    				= pstkrv_lag1 					/ transform = (lag 1);
	convert prcc_f    				= prcc_f_lag1 					/ transform = (lag 1);

	convert direct_indirect_s   	= direct_indirect_s_lag1 		/ transform = (lag 1);

	convert sim_breaches_t_1_all	= sim_breaches_t_1_all_lag1		/ transform = (lag 1);
	convert read					= read_lag1						/ transform = (lag 1);
	convert risk_length				= risk_length_lag1				/ transform = (lag 1);

	convert attack	 				= attack_lead1					/ transform = (lead 1);
	convert major_attack	 		= major_attack_lead1			/ transform = (lead 1);

	convert datadate 				= datadate_lag1					/ transform = (lag 1);
	convert datadate 				= datadate_lag2					/ transform = (lag 2);
	convert datadate 				= datadate_lead1				/ transform = (lead 1);
	convert datadate 				= datadate_lead2				/ transform = (lead 2);

	convert filingdate 				= filingdate_lag1				/ transform = (lag 1);
	convert filingdate 				= filingdate_lag2				/ transform = (lag 2);
	convert filingdate 				= filingdate_lead1				/ transform = (lead 1);
	convert filingdate 				= filingdate_lead2				/ transform = (lead 2);

run;
proc printto print = print log = log;
run;

/* 4. Variables: Firm-level */

data sample;
	set sample;
	
	* Attack variables;
	if attack_lead1 > 0 then atleast1attack_1 = 1; else atleast1attack_1 = 0;
	if major_attack_lead1 > 0 then atleast1major_attack_1 = 1; else atleast1major_attack_1 = 0;

	* Control variables;
	bm = ceq / (prcc_f * csho);
	cash_at = che / at;
	lnage = log(1 + (fyear + 1 - birth) ); 
	lnsize = log(1 + at);
	roa = oibdp / at;
	xrd_at = xrd / at;
	tangibility = ppent / at;
	tobinsq = (at - ceq + prcc_f * csho) / at;
	return = (prcc_f/ajex  + dvpsx_f/ajex) / (lag1_prcc_f/lag1_ajex + lag1_dvpsx_f/lag1_ajex) - 1;

	cfo_at = (ib + dp - dvc) / at;

	sic_2d = substr(sic,1,2);

run;

/* 5. Variables: Industry-level */

proc printto print = 'rocks.expand.lst' log = 'rocks.expand.log' new;
run;
proc expand data = sample out = sample method = none ; 
	by gvkey;
	id fyear;
	convert cfo_at = std_cfo_at / TRANSFORMOUT = (MOVSTD 5 );
run;
proc printto print = print log = log;
run;

proc sql;
	create table sample 
	as select *, mean(std_cfo_at) as ind_std_cfo_at
	from sample
	group by sic_2d, fyear
	order by gvkey, fyear;
quit;

/* 6. Create necessary lead lag variables */

proc printto print = 'rocks.expand.lst' log = 'rocks.expand.log' new;
run;
proc expand data = sample out = sample method = none;
	by gvkey;
	id fyear;
	convert ind_std_cfo_at  	= ind_std_cfo_at_lag1 		/ transform = (lag 1);
	convert lnage    			= lnage_lag1 				/ transform = (lag 1);
	convert lnsize    			= lnsize_lag1 				/ transform = (lag 1);
	convert ind_directors    	= ind_directors_lag1 		/ transform = (lag 1);
	convert inst_perblock   	= inst_perblock_lag1		/ transform = (lag 1);
	convert inst_per13f			= inst_per13f_lag1			/ transform = (lag 1);
	convert lnread    			= lnread_lag1 				/ transform = (lag 1);
	convert risk_com    		= risk_com_lag1 			/ transform = (lag 1);
	convert lnrisk_length   	= lnrisk_length_lag1 		/ transform = (lag 1);
	convert risk_length    		= risk_length_lag1 			/ transform = (lag 1);
	convert roa 	   			= roa_lag1 					/ transform = (lag 1);
	convert xrd_at    			= xrd_at_lag1 				/ transform = (lag 1);
	convert secrets5    		= secrets5_lag1 			/ transform = (lag 1);
	convert tangibility 		= tangibility_lag1 			/ transform = (lag 1);
	convert tobinsq    			= tobinsq_lag1 				/ transform = (lag 1);

run;
proc printto print = print log = log;
run;

/* 7. Create industry dummies */

%ind_ff12(dset = sample, outp = sample, sic = sich, ind_code = ind_ff12);

/* 8. Final sample */

data sample_final (keep = gvkey fyear ind_ff12   sim_breaches_t_1_all   simj_breaches_t_1_all  
							bm  size lnsize age lnage tobinsq roa tangibility xrd_at secrets5 ind_std_cfo_at 
							risk_length   lnrisk_length   read  lnread inst_perblock ind_directors  risk_com direct_indirect_s  direct_indirect_s  risk_length  
							sent_neg2 sent_lit2  sent_uncert2 direct_indirect_s insurance atleast1attack_1 atleast1major_attack_1 atleast1nonmajor_attack_1);
	set sample;
	* Other restrictions;
	if 2005 <= fyear <= 2018;
	if cikmatched = 1;
run;

* 9. Export all data to STATA for the main analyses;
proc export data = work.sample_final
            outfile = "G:\Cybersecurity risk\Stata data\sample_final.dta"
            dbms = STATA replace;
run;


